
import os
import json
import numpy as np
import pandas as pd
import networkx as nx

from math import sqrt


# Keywords looked for in album names to decide which room a photo belongs to;
# also the levels of the one-hot encoded "room" feature.
ROOM_NAMES = [
    "dig",
    "office",
    "apartment",
    "home",
    "theater",
    "agency",
]

# Data sets to process: each name is a sub-directory of ../data and a level
# of the one-hot encoded "location" feature.
LOCATIONS = [
    "Boston",
    "Arizona",
    "NYC",
    "Texas",
]

########################################################################################################################

def extract_room_name(album_name):
    """Return the single room keyword mentioned in an album name.

    The album name is lower-cased, "-" and ":" are treated as word
    separators, and the resulting words are compared against ROOM_NAMES.

    Returns the matching room name when exactly one keyword occurs in the
    album name, and the string "NA" when none or several occur.
    """
    normalized = album_name.lower()
    for separator in ("-", ":"):
        normalized = normalized.replace(separator, " ")
    words = set(normalized.split())

    # keep only the known room keywords that appear as whole words
    matches = [name for name in set(ROOM_NAMES) if name in words]

    # an ambiguous (or missing) room is reported as not available
    return matches[0] if len(matches) == 1 else "NA"

########################################################################################################################

def face_closeness(faces):
    """Measure how close together the faces of one photo are.

    Each face is a dict whose ``face['position']['center']`` entry holds the
    ``'x'`` and ``'y'`` coordinates of the face centre.

    Returns a tuple ``(all_pairs_avg, mst_avg)``:

    * ``all_pairs_avg`` -- the sum of the Euclidean distances over every
      ordered pair of faces (so each unordered pair is counted twice and
      each face is paired with itself at distance 0), divided by the number
      of faces;
    * ``mst_avg`` -- the total edge weight of a minimum spanning tree of the
      complete distance graph, divided by the number of faces.

    An empty ``faces`` list ends in a division by zero.
    """
    centers = [face[u'position'][u'center'] for face in faces]
    n_centers = len(centers)

    graph = nx.Graph()
    distance_sum = 0.0

    for a in range(n_centers):
        for b in range(n_centers):
            # Euclidean distance between the two face centres
            dx = centers[a][u'x'] - centers[b][u'x']
            dy = centers[a][u'y'] - centers[b][u'y']
            dist = sqrt(dx**2 + dy**2)

            graph.add_edge(a, b, weight=dist)
            distance_sum += dist

    # total length of a minimum spanning tree over the face centres
    spanning_tree = nx.minimum_spanning_tree(graph, weight='weight')
    tree_length = sum(
        data['weight'] for _, _, data in spanning_tree.edges(data=True)
    )

    # normalise both totals by the number of people in the photo
    head_count = float(len(faces))
    return distance_sum / head_count, tree_length / head_count

########################################################################################################################

def entropy(counts):
    """Return the normalized Shannon entropy of a list of category counts.

    The counts are turned into relative frequencies and the entropy is
    computed with logarithm base ``len(counts)``, so a uniform distribution
    over all categories gives 1.0 and a distribution concentrated in one
    category gives 0.0.  Categories with a zero count are ignored.

    Returns 0.0 (never NaN or negative zero) when fewer than two categories
    are given or when every count is zero.  The result is a ``numpy.float32``
    because the computation is carried out in single precision.
    """
    base = len(counts)
    counts = np.array(counts, dtype="f")
    # NB: ignore zeros
    counts = counts[np.nonzero(counts)]

    # With fewer than two categories log2(base) is 0 or -inf, and with no
    # observations there is no distribution: the entropy is 0 in both cases.
    if base < 2 or counts.size == 0:
        return np.float32(0.0)

    p = counts / np.sum(counts)
    value = -np.sum(p * (np.log2(p) / np.log2(base)))

    # Negating an exact zero yields IEEE -0.0 (printed as "-0.000");
    # adding +0.0 normalizes it without changing any other value.
    return value + np.float32(0.0)

########################################################################################################################

def factor_to_one_hot(instance_dict, feature_name, feature_values):
    """Add one-hot indicator entries for a categorical feature.

    ``feature_name`` and every entry of ``feature_values`` are lower-cased.
    For each value a key ``"<feature_name>_<value>"`` is written into
    ``instance_dict``: 1.0 when ``instance_dict[feature_name]`` equals that
    value, 0.0 otherwise.

    The dictionary is modified in place and also returned.  A KeyError is
    raised when ``instance_dict`` has no entry for the lower-cased feature
    name (and at least one value is given).
    """
    column = feature_name.lower()
    levels = [level.lower() for level in feature_values]

    for level in levels:
        is_active = instance_dict[column] == level
        instance_dict["%s_%s" % (column, level)] = 1.0 if is_active else 0.0

    return instance_dict

########################################################################################################################

#
# MAIN
#
# Column order of the generated feature table.
_OUTPUT_FIELDS = [
    "file_id",
    "location",
    'location_boston',
    'location_arizona',
    'location_texas',
    'location_nyc',
    "room",
    'room_theater',
    'room_dig',
    'room_office',
    'room_home',
    'room_apartment',
    'room_agency',
    "num_ppl",
    "pos_all_pairs_avg",
    "pos_mst_avg",
    "age_mean",
    "age_median",
    "age_std",
    "gender_NA",
    "gender_female",
    "gender_male",
    "gender_entropy",
    "glass_NA",
    "glass_dark",
    "glass_none",
    "glass_normal",
    "p_glass",
    "race_NA",
    "race_asian",
    "race_black",
    "race_white",
    "race_entropy",
    "smile_coef_mean",
    "smile_coef_median",
    "smile_coef_std",
]

# Face attributes that are counted per value for every photo.
_CATEGORICAL_ATTRIBUTES = [u'gender', u'glass', u'race']

# Every value whose count column must exist, even when it was never seen.
_ATTRIBUTE_VALUES = {
    "gender": ["NA", "male", "female"],
    "glass": ["NA", "none", "normal", "dark"],
    "race": ["NA", "asian", "black", "white"],
}

# Categorical values detected with a confidence below this are counted as "NA".
_MIN_CONFIDENCE = 90


def _read_location_annotations(base_dir, location):
    """Read ``all_photo_urls.txt`` of one location (pass 1).

    Every line is split on "||"; the fourth field is used as the file id and
    the third as the album name, from which the room name is extracted.

    Returns a dict mapping each file id to a fresh feature dict holding the
    ``file_id``, the lower-cased ``location`` and the ``room``.
    """
    annotations = {}
    all_url_path = "%s/all_photo_urls.txt" % base_dir

    # the context manager closes the file even if a line is malformed
    with open(all_url_path) as url_file:
        for line in url_file:
            parts = line.split("||")
            file_id = parts[3].strip()
            album_name = parts[2].strip()

            annotations[file_id] = {
                "file_id": file_id,
                "location": location.lower(),
                "room": extract_room_name(album_name),
            }

    return annotations


def _load_faces(fpath):
    """Return the ``"face"`` list of a JSON annotation file, or None if the file is missing."""
    if not os.path.isfile(fpath):
        return None

    with open(fpath) as annotation_file:
        return json.load(annotation_file)["face"]


def _add_face_features(photo_dict, faces):
    """Fill ``photo_dict`` in place with the features derived from ``faces``.

    ``faces`` must be non-empty.  Adds per-value counts of the categorical
    attributes, one-hot location/room indicators, the number of people,
    age and smile statistics, the two closeness measures, race and gender
    entropies and the share of people wearing normal glasses.
    """
    # containers for the continuous variables
    ages = []
    smiling_coefs = []

    for face in faces:
        attributes = face[u'attribute']

        # categorical variables: count how often each value occurs
        for att in _CATEGORICAL_ATTRIBUTES:
            att_dict = attributes[att]
            value = str(att_dict[u'value']).lower()

            # low-confidence detections are not trusted
            if u'confidence' in att_dict and att_dict[u'confidence'] < _MIN_CONFIDENCE:
                value = "NA"

            feature = "%s_%s" % (att, value)
            photo_dict[feature] = photo_dict.get(feature, 0) + 1

        # continuous variables (age and smiling)
        ages.append(attributes[u'age'][u'value'])
        smiling_coefs.append(attributes[u'smiling'][u'value'])

    # one-hot representations of the categorical photo-level variables
    factor_to_one_hot(photo_dict, "location", LOCATIONS)
    factor_to_one_hot(photo_dict, "room", ROOM_NAMES)

    # metrics for how close the people in the picture are
    all_pairs_avg, mst_avg = face_closeness(faces)

    # aggregate continuous variables
    ages = np.array(ages)
    smiling_coefs = np.array(smiling_coefs)

    photo_dict["num_ppl"] = len(faces)

    photo_dict["age_mean"] = np.mean(ages)
    photo_dict["age_median"] = np.median(ages)
    photo_dict["age_std"] = np.std(ages)

    photo_dict["smile_coef_mean"] = np.mean(smiling_coefs)
    photo_dict["smile_coef_median"] = np.median(smiling_coefs)
    photo_dict["smile_coef_std"] = np.std(smiling_coefs)

    photo_dict["pos_all_pairs_avg"] = all_pairs_avg
    photo_dict["pos_mst_avg"] = mst_avg

    # entropies of the confidently detected race and gender counts
    photo_dict["race_entropy"] = entropy(
        [
            photo_dict.get("race_asian", 0.0),
            photo_dict.get("race_black", 0.0),
            photo_dict.get("race_white", 0.0),
        ]
    )
    photo_dict["gender_entropy"] = entropy(
        [
            photo_dict.get("gender_male", 0.0),
            photo_dict.get("gender_female", 0.0),
        ]
    )
    photo_dict["p_glass"] = photo_dict.get("glass_normal", 0.0) / float(photo_dict["num_ppl"])

    # set unseen categorical variable counts to 0
    for attribute, values in _ATTRIBUTE_VALUES.items():
        for value in values:
            att = "%s_%s" % (attribute, value)
            photo_dict[att] = photo_dict.get(att, 0)

    return photo_dict


def main():
    """Build ``../data/features.csv`` from the annotations of every location.

    For each entry of LOCATIONS the photo list is read, the per-photo JSON
    annotation is loaded, and one feature row is produced for every photo
    that has an annotation file with at least one detected face.
    """
    photo_dicts = []
    fets = set()

    for location in LOCATIONS:
        base_dir = "../data/%s" % location

        #
        # PASS 1: read image ids and parse the room name
        #
        loc_annotations = _read_location_annotations(base_dir, location)

        #
        # PASS 2: go through the json files and extract the other features
        #
        n_photos = 0
        for photo_id, photo_dict in loc_annotations.items():
            if (n_photos % 100) == 0:
                # progress report, printed as a (count, location) tuple
                print((n_photos, location))
            fpath = "%s/annotations/%s.json" % (base_dir, photo_id)

            # Skip photos without an annotation file or without detected
            # faces.  The dict is not modified here: removing entries while
            # iterating over it is an error on Python 3.
            faces = _load_faces(fpath)
            if not faces:
                continue

            _add_face_features(photo_dict, faces)

            photo_dicts.append(photo_dict)
            fets = fets | set(photo_dict.keys())
            n_photos += 1

    print("FEATURES: %s" % (fets,))

    # output csv file; passing the column list also yields a header-only
    # table when no photo could be processed
    df = pd.DataFrame(photo_dicts, columns=_OUTPUT_FIELDS)
    df.to_csv("../data/features.csv", na_rep="NA", float_format="%.3f", index=False)

    print("Done!")

########################################################################################################################

# Run the feature extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# END
